﻿using System;
using System.Collections.Generic;
using System.Data;
using System.Text;
using System.Drawing;
using Common.Utilities;

namespace PdfLib.TextScraping
{
    /// <summary>
    /// Reads tabular text from a PDF page into a <see cref="DataTable"/> by assigning
    /// text tokens to rows (vertical ranges) and columns (horizontal ranges).
    /// </summary>
    public class TableDetector
    {
        /// <summary>
        /// Row spacing, relative to font size.
        /// </summary>
        public const double RelativeRowSpacing = 0.5;

        /// <summary>
        /// Builds a table of strings from the text tokens found on one page of a PDF file.
        /// Row boundaries come from <c>TableRowDetector.DetectTableRows</c>; column
        /// boundaries come from the horizontal extent (X .. X + Width) of each rectangle
        /// in <paramref name="colLayouts"/>.
        /// </summary>
        /// <param name="pdfFilePath">Path of the PDF file to read.</param>
        /// <param name="pageNum">Page number passed through to the row detector and the text reader.</param>
        /// <param name="colLayouts">
        /// Column name mapped to the column's rectangle. One string column is created per key,
        /// in the dictionary's enumeration order. Only X and Width of the rectangle are used here.
        /// </param>
        /// <param name="rowDetector">Instruction object passed through to the row detector.</param>
        /// <returns>
        /// A table with one row per detected row range. A token whose Y position lies inside the
        /// row range is appended (space separated) to the first column whose horizontal range
        /// overlaps the token's; cells that receive no token stay <see cref="DBNull"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="colLayouts"/> is null.</exception>
        public static DataTable ReadTableData(
            string pdfFilePath, int pageNum,
            Dictionary<string, RectangleF> colLayouts,
            TableRowDetectorInstruction rowDetector)
        {
            if (colLayouts == null)
            {
                throw new ArgumentNullException("colLayouts");
            }

            List<string> nonNullFields = new List<string>();
            Dictionary<int, Range<float>> rowRanges = TableRowDetector.DetectTableRows(
                pdfFilePath, pageNum, colLayouts, nonNullFields, rowDetector);

            // Create the output columns and compute each column's horizontal range once,
            // instead of rebuilding it for every token. The list keeps the dictionary's
            // enumeration order so "first overlapping column wins" behaves as before.
            DataTable dtOutput = new DataTable();
            List<KeyValuePair<string, Range<float>>> colRanges =
                new List<KeyValuePair<string, Range<float>>>(colLayouts.Count);
            foreach (KeyValuePair<string, RectangleF> colLayout in colLayouts)
            {
                dtOutput.Columns.Add(colLayout.Key, typeof(string));

                Range<float> colRange = new Range<float>();
                colRange.FromValue = colLayout.Value.X;
                colRange.ToValue = colLayout.Value.X + colLayout.Value.Width;
                colRanges.Add(new KeyValuePair<string, Range<float>>(colLayout.Key, colRange));
            }

            List<PdfTextToken> txtTokens = PdfTextReader.ReadPageContentAsTokens(pdfFilePath, pageNum);
            txtTokens.Sort();

            foreach (KeyValuePair<int, Range<float>> rowEntry in rowRanges)
            {
                Range<float> rowRange = rowEntry.Value;
                DataRow dr = dtOutput.NewRow();

                foreach (PdfTextToken token in txtTokens)
                {
                    if (token.Position.Y < rowRange.FromValue)
                    {
                        // Token lies before this row's range; keep scanning.
                        continue;
                    }
                    if (token.Position.Y > rowRange.ToValue)
                    {
                        // NOTE(review): this early exit assumes the Sort() above orders tokens
                        // by ascending Position.Y - confirm against PdfTextToken's comparison.
                        break;
                    }

                    Range<float> txtTokenRange = new Range<float>();
                    txtTokenRange.FromValue = token.TextRect.X;
                    txtTokenRange.ToValue = token.TextRect.X + token.TextRect.Width;

                    foreach (KeyValuePair<string, Range<float>> col in colRanges)
                    {
                        if (!col.Value.OverlapWith(txtTokenRange))
                        {
                            continue;
                        }

                        // An unset DataRow cell holds DBNull.Value, never null, so it must be
                        // tested with IsNull; comparing against null always took the append
                        // path and prefixed every cell with a stray space.
                        if (dr.IsNull(col.Key))
                        {
                            dr[col.Key] = token.Text;
                        }
                        else
                        {
                            dr[col.Key] = (string)dr[col.Key] + " " + token.Text;
                        }

                        // A token is assigned to at most one column: the first that overlaps.
                        break;
                    }
                }

                dtOutput.Rows.Add(dr);
            }

            return dtOutput;
        }

        // Disabled: ScanTables is currently commented out and kept for reference only.
        //
        // Returns the list of page numbers that contain a table.
        // The detection runs in the order header --> footer --> columns,
        // and a page must satisfy all criteria.
        //
        //public static List<int> ScanTables(string pdfFilePath, 
        //    TableHeaderDetectorInstruction headerDetector, 
        //    TableFooterDetectorInstruction footerDetector, 
        //    TableColumnDetectorInstruction colDetector)
        //{
        //    int pageCount = PdfPropertyReader.GetPageCount(pdfFilePath);
        //    List<int> pagesContainingTable = new List<int>();
        //    for (int pageNum = 1; pageNum <= pageCount; pageNum++)
        //    {
        //        double headerSearchScore = TableHeaderDetector.DetectTableHeader(pdfFilePath, pageNum, headerDetector);
        //        if (headerSearchScore >= headerDetector.MatchScoreThreshold)
        //        {
        //            double footerSearchScore = TableFooterDetector.DetectTableFooter(pdfFilePath, pageNum, footerDetector);
        //            if (footerSearchScore >= footerDetector.MatchScoreThreshold)
        //            {
        //                double colSearchScore = TableColumnDetector.DetectTableColumns(pdfFilePath, pageNum, colDetector);
        //                if (colSearchScore >= colDetector.MatchScoreThreshold)
        //                {
        //                    pagesContainingTable.Add(pageNum);
        //                }
        //            }
        //        }
        //    }
        //    return pagesContainingTable;
        //}
    }
}
